In [1]:
import warnings
warnings.filterwarnings("ignore")

import os

import pandas as pd
import numpy as np
import plotly.express as px

import missingno as msno
In [2]:
#################################################################
##### Setting pandas display options for pretty print
#################################################################

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
pd.set_option('display.max_colwidth', 100)

Constant parameters¶

In [3]:
# COVID data URL picked from https://github.com/owid/covid-19-data/tree/master/public/data
COVID_DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
COVID_DATA_FILE_NAME = "owid-covid-data.csv"

COVID_DATA_DICTIONARY_URL = "https://covid.ourworldindata.org/data/owid-covid-codebook.csv"
COVID_DATA_DICTIONARY_FILE_NAME = "owid-covid-codebook.csv"

RESULTS_PATH = "EDS_Covid19_Exp_results"

DATA_FOLDER = "data"

Generated Parameters¶

In [4]:
covid_data_full_file_path = os.path.join(RESULTS_PATH, 
                                         DATA_FOLDER, 
                                         COVID_DATA_FILE_NAME)

covid_data_dictionary_full_file_path = os.path.join(RESULTS_PATH, 
                                                    DATA_FOLDER, 
                                                    COVID_DATA_DICTIONARY_FILE_NAME)
In [5]:
# Creating experiment folders as required
if not os.path.isdir(RESULTS_PATH):
    os.makedirs(RESULTS_PATH)
    
if not os.path.isdir(os.path.join(RESULTS_PATH, 
                                  DATA_FOLDER)):
    os.makedirs(os.path.join(RESULTS_PATH, 
                             DATA_FOLDER))

Fetching Data¶

In [6]:
#################################################################
##### Covid data 
#################################################################

if os.path.isfile(covid_data_full_file_path):
    print("Local data file available:", covid_data_full_file_path)
    df_full_covid_data = pd.read_csv(covid_data_full_file_path, 
                                     sep=",", 
                                     header=0)
    print("Data load complete.")
else:
    print("No local data file found. Downloading from URL:", COVID_DATA_URL)
    df_full_covid_data = pd.read_csv(COVID_DATA_URL, 
                                     sep=",", 
                                     header=0)
    df_full_covid_data.to_csv(covid_data_full_file_path, 
                              sep=",", 
                              header=True, 
                              index=False)
    print("Data downloaded and saved to:", covid_data_full_file_path)

#################################################################
##### Covid data dictionary
#################################################################

if os.path.isfile(covid_data_dictionary_full_file_path):
    print("\nLocal dictionary file available:", covid_data_dictionary_full_file_path)
    df_covid_data_dict = pd.read_csv(covid_data_dictionary_full_file_path, 
                                     sep=",", 
                                     header=0)
    print("Dictionary load complete.")
else:
    print("\nNo local dictionary file found. Downloading from URL:", COVID_DATA_DICTIONARY_URL)
    df_covid_data_dict = pd.read_csv(COVID_DATA_DICTIONARY_URL, 
                                     sep=",", 
                                     header=0)
    df_covid_data_dict.to_csv(covid_data_dictionary_full_file_path, 
                              sep=",", 
                              header=True, 
                              index=False)
    print("Dictionary downloaded and saved to:", covid_data_dictionary_full_file_path)
No local data file found. Downloading from URL: https://covid.ourworldindata.org/data/owid-covid-data.csv
Data downloaded and saved to: EDS_Covid19_Exp_results\data\owid-covid-data.csv

No local dictionary file found. Downloading from URL: https://covid.ourworldindata.org/data/owid-covid-codebook.csv
Dictionary downloaded and saved to: EDS_Covid19_Exp_results\data\owid-covid-codebook.csv
In [7]:
#################################################################
##### codebook - contains details of every column specified by data provider
#################################################################

df_covid_data_dict[['column', 'description']]
Out[7]:
column description
0 iso_code ISO 3166-1 alpha-3 – three-letter country codes
1 continent Continent of the geographical location
2 location Geographical location
3 date Date of observation
4 total_cases Total confirmed cases of COVID-19. Counts can include probable cases, where reported.
5 new_cases New confirmed cases of COVID-19. Counts can include probable cases, where reported. In rare case...
6 new_cases_smoothed New confirmed cases of COVID-19 (7-day smoothed). Counts can include probable cases, where repor...
7 total_deaths Total deaths attributed to COVID-19. Counts can include probable deaths, where reported.
8 new_deaths New deaths attributed to COVID-19. Counts can include probable deaths, where reported. In rare c...
9 new_deaths_smoothed New deaths attributed to COVID-19 (7-day smoothed). Counts can include probable deaths, where re...
10 total_cases_per_million Total confirmed cases of COVID-19 per 1,000,000 people. Counts can include probable cases, where...
11 new_cases_per_million New confirmed cases of COVID-19 per 1,000,000 people. Counts can include probable cases, where r...
12 new_cases_smoothed_per_million New confirmed cases of COVID-19 (7-day smoothed) per 1,000,000 people. Counts can include probab...
13 total_deaths_per_million Total deaths attributed to COVID-19 per 1,000,000 people. Counts can include probable deaths, wh...
14 new_deaths_per_million New deaths attributed to COVID-19 per 1,000,000 people. Counts can include probable deaths, wher...
15 new_deaths_smoothed_per_million New deaths attributed to COVID-19 (7-day smoothed) per 1,000,000 people. Counts can include prob...
16 reproduction_rate Real-time estimate of the effective reproduction rate (R) of COVID-19. See https://github.com/cr...
17 icu_patients Number of COVID-19 patients in intensive care units (ICUs) on a given day
18 icu_patients_per_million Number of COVID-19 patients in intensive care units (ICUs) on a given day per 1,000,000 people
19 hosp_patients Number of COVID-19 patients in hospital on a given day
20 hosp_patients_per_million Number of COVID-19 patients in hospital on a given day per 1,000,000 people
21 weekly_icu_admissions Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week (repor...
22 weekly_icu_admissions_per_million Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week per 1,...
23 weekly_hosp_admissions Number of COVID-19 patients newly admitted to hospitals in a given week (reporting date and the ...
24 weekly_hosp_admissions_per_million Number of COVID-19 patients newly admitted to hospitals in a given week per 1,000,000 people (re...
25 total_tests Total tests for COVID-19
26 new_tests New tests for COVID-19 (only calculated for consecutive days)
27 total_tests_per_thousand Total tests for COVID-19 per 1,000 people
28 new_tests_per_thousand New tests for COVID-19 per 1,000 people
29 new_tests_smoothed New tests for COVID-19 (7-day smoothed). For countries that don't report testing data on a daily...
30 new_tests_smoothed_per_thousand New tests for COVID-19 (7-day smoothed) per 1,000 people
31 positive_rate The share of COVID-19 tests that are positive, given as a rolling 7-day average (this is the inv...
32 tests_per_case Tests conducted per new confirmed case of COVID-19, given as a rolling 7-day average (this is th...
33 tests_units Units used by the location to report its testing data
34 total_vaccinations Total number of COVID-19 vaccination doses administered
35 people_vaccinated Total number of people who received at least one vaccine dose
36 people_fully_vaccinated Total number of people who received all doses prescribed by the initial vaccination protocol
37 total_boosters Total number of COVID-19 vaccination booster doses administered (doses administered beyond the n...
38 new_vaccinations New COVID-19 vaccination doses administered (only calculated for consecutive days)
39 new_vaccinations_smoothed New COVID-19 vaccination doses administered (7-day smoothed). For countries that don't report va...
40 total_vaccinations_per_hundred Total number of COVID-19 vaccination doses administered per 100 people in the total population
41 people_vaccinated_per_hundred Total number of people who received at least one vaccine dose per 100 people in the total popula...
42 people_fully_vaccinated_per_hundred Total number of people who received all doses prescribed by the initial vaccination protocol per...
43 total_boosters_per_hundred Total number of COVID-19 vaccination booster doses administered per 100 people in the total popu...
44 new_vaccinations_smoothed_per_million New COVID-19 vaccination doses administered (7-day smoothed) per 1,000,000 people in the total p...
45 new_people_vaccinated_smoothed Daily number of people receiving their first vaccine dose (7-day smoothed)
46 new_people_vaccinated_smoothed_per_hundred Daily number of people receiving their first vaccine dose (7-day smoothed) per 100 people in the...
47 stringency_index Government Response Stringency Index: composite measure based on 9 response indicators including...
48 population Population (latest available values). See https://github.com/owid/covid-19-data/blob/master/scri...
49 population_density Number of people divided by land area, measured in square kilometers, most recent year available
50 median_age Median age of the population, UN projection for 2020
51 aged_65_older Share of the population that is 65 years and older, most recent year available
52 aged_70_older Share of the population that is 70 years and older in 2015
53 gdp_per_capita Gross domestic product at purchasing power parity (constant 2011 international dollars), most re...
54 extreme_poverty Share of the population living in extreme poverty, most recent year available since 2010
55 cardiovasc_death_rate Death rate from cardiovascular disease in 2017 (annual number of deaths per 100,000 people)
56 diabetes_prevalence Diabetes prevalence (% of population aged 20 to 79) in 2017
57 female_smokers Share of women who smoke, most recent year available
58 male_smokers Share of men who smoke, most recent year available
59 handwashing_facilities Share of the population with basic handwashing facilities on premises, most recent year available
60 hospital_beds_per_thousand Hospital beds per 1,000 people, most recent year available since 2010
61 life_expectancy Life expectancy at birth in 2019
62 human_development_index A composite index measuring average achievement in three basic dimensions of human development—a...
63 excess_mortality Percentage difference between the reported number of weekly or monthly deaths in 2020–2021 and t...
64 excess_mortality_cumulative Percentage difference between the cumulative number of deaths since 1 January 2020 and the cumul...
65 excess_mortality_cumulative_absolute Cumulative difference between the reported number of deaths since 1 January 2020 and the project...
66 excess_mortality_cumulative_per_million Cumulative difference between the reported number of deaths since 1 January 2020 and the project...

Missing data analysis¶

In [8]:
df_full_covid_data.shape
Out[8]:
(197003, 67)
In [9]:
# Required fields for Delivery 1 and 2
required_fields = ['location', 'date',
                   'total_cases', 'population', 
                   'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']

Filter out continental and world data, keep only country specific data¶

In [10]:
print('Non-country locations:\n', df_full_covid_data[df_full_covid_data['continent'].isnull()]['location'].unique())
Non-country locations:
 ['Africa' 'Asia' 'Europe' 'European Union' 'High income' 'International'
 'Low income' 'Lower middle income' 'North America' 'Oceania'
 'South America' 'Upper middle income' 'World']
In [11]:
df_full_covid_data = df_full_covid_data[df_full_covid_data['continent'].notnull()].reset_index(drop=True)

Bar chart representing relative completion of the data columns:¶

In [12]:
msno.bar(df_full_covid_data)
Out[12]:
<AxesSubplot:>
In [13]:
msno.bar(df_full_covid_data[required_fields])
Out[13]:
<AxesSubplot:>

Nullity matrix, similar to the bar chart, but spatially represents the missing information in the dataframe:¶

In [14]:
msno.matrix(df_full_covid_data)
Out[14]:
<AxesSubplot:>
In [15]:
msno.matrix(df_full_covid_data[required_fields])
Out[15]:
<AxesSubplot:>

Dendogram to visualize the correlation of completion between variables:¶

In [16]:
msno.dendrogram(df_full_covid_data)
Out[16]:
<AxesSubplot:>
In [17]:
msno.dendrogram(df_full_covid_data[required_fields])
Out[17]:
<AxesSubplot:>

Analyzing missing data of vaccinations by country¶

In [18]:
df_missing_vaccination_by_country = df_full_covid_data[[required_fields[i] for i in [0, -3, -2, -1]]].drop('location', 
                                                                                                           axis=1).isna().groupby(df_full_covid_data.location).sum().reset_index()

df_missing_vaccination_by_country['location_records'] = df_full_covid_data['location'].value_counts().reset_index().sort_values(by='index')['location'].reset_index(drop=True)

df_missing_vaccination_by_country['people_vaccinated'] = df_missing_vaccination_by_country['people_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['people_fully_vaccinated'] = df_missing_vaccination_by_country['people_fully_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['total_boosters'] = df_missing_vaccination_by_country['total_boosters']/df_missing_vaccination_by_country['location_records']

df_missing_vaccination_by_country['mean_total_missing'] = (df_missing_vaccination_by_country['people_vaccinated'] + df_missing_vaccination_by_country['people_fully_vaccinated'] + df_missing_vaccination_by_country['total_boosters']) / 3
In [19]:
fig = px.bar(df_missing_vaccination_by_country, x='location', y='mean_total_missing')
fig.show()
In [20]:
df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True).head(10)
Out[20]:
location people_vaccinated people_fully_vaccinated total_boosters location_records mean_total_missing
111 Latvia 0.349 0.365 0.372 852 0.362
198 Switzerland 0.366 0.366 0.377 853 0.370
218 United States 0.373 0.373 0.400 887 0.382
63 Estonia 0.382 0.382 0.392 875 0.385
117 Lithuania 0.364 0.385 0.426 849 0.392
153 Norway 0.342 0.374 0.471 853 0.396
70 France 0.385 0.386 0.419 885 0.397
54 Denmark 0.365 0.381 0.445 876 0.397
7 Argentina 0.400 0.400 0.400 908 0.400
35 Canada 0.418 0.388 0.438 886 0.415

Missing Data Analysis Insights:¶

  • Total 197003 records in data
  • Missing values in many columns. Required fields for Delivery 1 and 2 have missing values in:
    • total_cases, population, people_vaccinated, people_fully_vaccinated, total_boosters
  • Missing values are significant in vaccination data
    • Missing values often significant for many countries/region
    • Using the mean missing information from the three vaccination measures (people_vaccinated people_fully_vaccinated total_boosters), top 3 countries with least missing information selected
      • Latvia, Switzerland, United States selected in current report
  • From nullity matrix and dendogram, we can see that missing values of columns can occur in pairs (correlated)
In [21]:
country_list = list(df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True)['location'].head(3))
In [22]:
df_3country_covid_data = df_full_covid_data[df_full_covid_data['location'].isin(country_list)][required_fields].reset_index(drop=True)

Missing Value Imputation¶

  • Interpolating using pandas default function, in both direction to impute all missing values.
    • Executed separately for each country
In [23]:
impute_column_list = ['total_cases', 'population', 
                      'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']
In [24]:
msno.matrix(df_3country_covid_data)
Out[24]:
<AxesSubplot:>
In [25]:
df_3country_covid_data_imputed = None

for current_country in list(df_3country_covid_data['location'].unique()):
    
    temp_df_current_country_data = df_3country_covid_data[df_3country_covid_data['location'] == current_country]
    temp_df_current_country_data = temp_df_current_country_data.sort_values(by='date', ascending=True)
    
    for col in impute_column_list:
        temp_df_current_country_data[col] = temp_df_current_country_data[col].interpolate(limit_direction="both")
    
    if df_3country_covid_data_imputed is None:
        df_3country_covid_data_imputed = temp_df_current_country_data
    else:
        df_3country_covid_data_imputed = pd.concat([df_3country_covid_data_imputed, temp_df_current_country_data],
                                                   ignore_index=True
                                                  )
In [26]:
msno.matrix(df_3country_covid_data_imputed)
Out[26]:
<AxesSubplot:>

Exploratory Data Analysis¶

Delivery 1: The relative cases overtime of Covid infectors (absolute Covid cases/population size)¶

In [27]:
df_3country_covid_data_imputed['infector_percentage'] = df_3country_covid_data_imputed['total_cases']/df_3country_covid_data_imputed['population']
In [28]:
fig = px.line(df_3country_covid_data_imputed, 
              x='date', y='infector_percentage', 
              color='location',
              # title = 'The relative cases overtime of Covid infectors'
             )
fig.update_layout(
    yaxis_title="Total cases as percentage of population",
    xaxis_title="Timeline",
    title = {
        'text':'The total affected rate (percentage of the population) over time',
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend = {
        'title':{'text':'Location'}
    }
)
fig.show(renderer='notebook')

Delivery 2: The vaccination rate (percentage of the population) over time¶

In [29]:
df_3country_covid_data_imputed['vaccination_dose1_percentage'] = df_3country_covid_data_imputed['people_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose2_percentage'] = df_3country_covid_data_imputed['people_fully_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose3_percentage'] = df_3country_covid_data_imputed['total_boosters']/df_3country_covid_data_imputed['population']

df_3country_covid_data_imputed_Transformed = None

for dose in range(1,4):
    
    col_list = ['location', 'date', 'vaccination_dose{}_percentage'.format(dose)]
    temp_df = df_3country_covid_data_imputed[col_list]
    
    temp_df['location'] = temp_df['location']+', '+str(dose)
    temp_df.columns = ['location, dose', 'date', 'vaccination_percentage']
    
    if df_3country_covid_data_imputed_Transformed is None:
        df_3country_covid_data_imputed_Transformed = temp_df
    else:
        df_3country_covid_data_imputed_Transformed = pd.concat([df_3country_covid_data_imputed_Transformed, temp_df],#
                                                               ignore_index=True
                                                              )
In [30]:
fig = None
fig = px.line(df_3country_covid_data_imputed_Transformed, 
              x='date', y='vaccination_percentage', 
              color='location, dose',
             )
fig.update_layout(
    yaxis_title="Percentage of population vaccinated",
    xaxis_title="Timeline",
    title = {
        'text':'The vaccination rate (percentage of the population) over time',
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend = {
        'title':{'text':'Location, Dosage'}
    }
)
fig.show(renderer='notebook')
In [ ]: